Pre-processing wiki text

Filter by list of pages of interest

In [ ]:
import pandas as pd
import numpy as np

Get list of pages from catscan

In [ ]:
# Load the CatScan page list (one wiki page title per row).
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0;
# switch to on_bad_lines='skip' when running on modern pandas.
df = pd.read_csv('data/materials-pages', error_bad_lines = False)
In [ ]:
# preview the first rows of the CatScan page list
df.head()
In [ ]:
# Map underscored page titles to their space-separated form, for matching
# against article titles in the dump (value is a dummy 0; dict used as a set).
pageDict = {title.replace('_', ' '): 0 for title in df['title']}
In [ ]:
# inspect the lookup dict (large output — len(pageDict) would be quieter)
pageDict

JSON pre-processed dump

Search for articles in dump by title

In [ ]:
# Scan the processed dump and keep only articles whose title is in pageDict.
# The original failed under Python 3.6 because BZ2File yields bytes while
# fwrite is a text-mode file; decoding each line fixes that.
import os
import bz2
import json
import time

t0 = time.time()

# Toggle to skip this slow full-dump scan.
run_title_search = True

if run_title_search:
    writefile = "data/wiki_proc_filt.json"
    written = 0
    with open(writefile, 'w') as fwrite:
        dirs = os.listdir('data/wiki_processed')
        for folder in dirs:
            filelist = os.listdir("data//wiki_processed//"+folder)
            for filename in filelist:
                filepath = "data//wiki_processed//" + "//".join([folder, filename])
                with bz2.BZ2File(filepath, 'r') as fread:
                    for raw in fread:
                        # BZ2File yields bytes: decode so that json.loads and the
                        # text-mode fwrite.write both work under Python 3
                        line = raw.decode('utf-8')
                        d = json.loads(line)
                        if d['title'] in pageDict:
                            written += 1
                            fwrite.write(line)

    print("Searched for " + str(len(pageDict)) + " titles.")
    print(str(written) + " titles found and written to " + writefile)

t = time.time()
print("time taken = " + str(t-t0) + "s.")

Search for articles in dump by "id"

Our list of articles in the category 'materials' with depth = 5 had 197530 articles. Searching by title, only 164600 of those were found in the processed wikipedia dump. This might be because the catscan is more recent than the dump.

Let's try to search instead by "id" rather than "title".

In [ ]:
# Scan the dump again, this time matching on page id instead of title.
# Same Python 3 fix as above: decode the bytes lines from BZ2File before
# parsing/writing them through a text-mode file.
import time

t0 = time.time()

# Toggle to skip this scan; it may take a few minutes.
run_id_search = True

if run_id_search:

    # df["pageid"] holds ints, matching the int(d['id']) lookup below
    idDict = {pageid: 0 for pageid in df["pageid"]}
    writefile = "data/wiki_proc_filt_id.json"
    writtenId = 0
    with open(writefile, 'w') as fwrite:
        dirs = os.listdir('data/wiki_processed')
        for folder in dirs:
            filelist = os.listdir("data//wiki_processed//"+folder)
            for filename in filelist:
                filepath = "data//wiki_processed//" + "//".join([folder, filename])
                with bz2.BZ2File(filepath, 'r') as fread:
                    for raw in fread:
                        # decode: BZ2File yields bytes, fwrite is text-mode (Python 3 fix)
                        line = raw.decode('utf-8')
                        d = json.loads(line)
                        if int(d['id']) in idDict:
                            writtenId += 1
                            fwrite.write(line)

    print("Searched for " + str(len(idDict)) + " titles.")
    print(str(writtenId) + " titles found and written to " + writefile)

t = time.time()
print("Time taken: " + str(t-t0) + "s")

Even after searching by id, about 27000 articles are missing. We will go ahead and process the articles we could find.

There is one complication in the dump I have processed. We need the dump with the links, but the one I have is without links. The idea behind preserving the links is that we should convert phrases within a link into a single entity - an n-gram. So below I obtained a different pre-processed dump.

Analyzing another pre-processed dump

This file is tab separated. Each section is written as a new entry. Also contains references.

  1. If there's a link, join words within the link by underscores to use it as a phrase
  2. Write the output to a new file in json format
  3. Join sections so that each article is one entry.
  4. Add a new entry in the json file with section names for each article
In [ ]:
import re

# WikiExtractor-style links look like: <a href"TARGET">anchor text</a>.
# Compiled once at module level instead of on every call.
link_pattern = re.compile(r'(<a href\")(.*?)(\">)(.*?)(</a>)')

def join_links(example):
    """Replace every link in `example` with its target page name,
    with spaces converted to underscores, so each linked phrase
    becomes a single n-gram token.

    Parameters: example (str) — one line of WikiExtractor output.
    Returns: str — the line with links collapsed to underscored targets.
    """
    # re.sub with a callback rewrites each match in a single pass,
    # instead of findall + repeated whole-string str.replace
    return link_pattern.sub(lambda m: m.group(2).replace(" ", "_"), example)
In [ ]:
# Rewrite the raw dump with links collapsed into underscored phrases.
# Context managers ensure both files are closed (readfile was never closed
# in the original, and the unused counter i has been dropped).
t0 = time.time()
with open("data//wiki_materials_rows_out.txt", 'r') as readfile:
    with open("data/link_joined.txt", 'w') as tempfile:
        for line in readfile:
            tempfile.write(join_links(line))

t = time.time()
print("Time taken = " + str(t-t0) + "s.")

Let's take a look at how the file looks now

In [ ]:
# Peek at the link-joined file: stash the second line in `example`,
# print lines 3-5, then stop reading.
readfile = open("data//link_joined.txt", 'r')

for i, line in enumerate(readfile):
    if i < 2:
        example = line
    elif i < 5:
        print(line)
    else:
        break

readfile.close()

For ease of working, convert the tab separated .txt file into json file

In [ ]:
# Convert the tab-separated dump into one JSON object per line,
# keeping only well-formed rows (exactly 6 tab-separated fields).
t0 = time.time()
lines_read = 0
lines_write = 0
with open("data/link_joined.txt", 'r') as src:
    with open("data/link_joined.json", 'w') as dst:
        for raw in src:
            lines_read += 1
            fields = raw.split('\t')
            if len(fields) == 6:
                lines_write += 1
                record = {"id": fields[0], "title": fields[1], "section": fields[2], "text": fields[5]}
                dst.write(json.dumps(record) + '\n')
print(str(lines_read) + " lines read.")
print(str(lines_write) + " lines written. ")

t = time.time()
print("time taken = " + str(t-t0) + "s.")

Make sure we still have all the articles we started with ~166000

In [ ]:
# Count distinct article titles in the link-joined dump, to confirm we
# still have roughly the ~166k articles we started with.
# (Removed the commented-out debug early-exit and the unused line counter.)
title_dict = {}
with open("data/link_joined.json") as f_r:
    for line in f_r:
        d = json.loads(line)
        title_dict[d['title']] = 1

print(len(title_dict))

Fuse text for the same title together. Assume that text from the same article is consecutive: once an article ends, it never restarts again.

In [ ]:
# Merge consecutive per-section records into one record per article.
# Assumes records for the same article are consecutive in the file.
t0 = time.time()
titles = 0
n_line = 0  # bug fix: was initialized to 1, so the n_line==1 branch below never ran
with open("data/link_joined.json", 'r') as readfile:
    with open("data/titles_joined.json", 'w') as writefile:
        for line in readfile:
            d = json.loads(line)
            n_line += 1
            if n_line == 1:
                # first record: start accumulating the first article
                id_prev = d["id"]
                title_prev = d["title"]
                text_prev = d["text"]
                section_prev = d["section"]
                sections = d["section"]
            elif d["id"] == id_prev:
                # same article: append this section's text
                text_prev = text_prev + "\n" + d["text"]
                # NOTE(review): section_prev is never updated here, so only sections
                # equal to the article's FIRST section are skipped; repeated names
                # like "History,History" still accumulate — confirm if intended.
                if d["section"] != section_prev:
                    sections = sections + "," + d["section"]
            else:
                # new article: flush the accumulated one, then restart
                titles += 1
                writeline = {"id": id_prev, "title": title_prev, "section": sections, "text": text_prev}
                writefile.write(json.dumps(writeline) + '\n')
                id_prev = d["id"]
                title_prev = d["title"]
                text_prev = d["text"]
                section_prev = d["section"]
                sections = d["section"]
        # bug fix: flush the final article, which the loop above never wrote
        if n_line > 0:
            titles += 1
            writeline = {"id": id_prev, "title": title_prev, "section": sections, "text": text_prev}
            writefile.write(json.dumps(writeline) + '\n')

print(str(titles) + " written.")
t = time.time()
print("time taken = " + str(t-t0) + "s.")

Make a list of titles

In [ ]:
# Build a set-like dict of underscored article titles (value 0 is a dummy).
titles = {}
with open('data/titles_joined.json', 'r') as readfile:
    for entry in readfile:
        record = json.loads(entry)
        titles[record['title'].replace(" ", "_")] = 0
len(titles)
In [ ]:
# dict.keys() returns a non-indexable view in Python 3; wrap in list() first
list(titles.keys())[5]

Get categories for each title from dbpedia

WikiExtractor does not import the category information for each article by default. However dbpedia stores a lot of structured information about each wikipedia article such as category names, templates, info-boxes etc.

Let's get the categories info from dbpedia.

In [ ]:
# Extract categories for our titles from the dbpedia article_categories dump
# (.ttl triples). Removed the no-op warnings.catch_warnings() block (it exited
# immediately, so the simplefilter never applied) and the dead `if 0==1` exit.
from collections import defaultdict

t0 = time.time()
titles_cat = defaultdict(list)
with open('data/article_categories_en.ttl', 'r') as readfile:
    n = 0
    for line in readfile:
        # skip the header line and any line without a '>' delimiter
        if n == 0 or len(line.split(">")) < 2:
            n += 1
            continue
        n = n + 1
        # subject of the triple: the article title
        t_db = line.split("<http://dbpedia.org/resource/")[1].split(">")[0]
        # Python 3 text-mode lines are already str; only Python 2 bytes need decoding
        # (the unconditional .decode('utf-8') raised AttributeError on Python 3)
        if isinstance(t_db, bytes):
            t_db = t_db.decode('utf-8')

        if t_db in titles:
            # object of the triple: "Category:<name>"
            cat = line.split("<http://dbpedia.org/resource/")[2].split(":")[1].split(">")[0]
            titles_cat[t_db].append(cat)
t = time.time()
print("Extracted categories for " + str(len(titles_cat.keys())) + " titles in " + str(t-t0) + "s.")
In [ ]:
# Titles for which dbpedia returned no categories.
notfound = [title for title in titles.keys() if title not in titles_cat]
In [ ]:
# Show the titles that got no dbpedia categories.
for missing in notfound:
    print(missing)
In [ ]:
# Persist the title -> categories mapping for reuse in later sessions.
with open('data/titles_cat.json', 'w') as writefile:
    json.dump(titles_cat, writefile)
In [ ]:
# Attach the dbpedia category list to every article that has one and write
# the final processed file.
t0 = time.time()
# renamed from `titles`: the original reused that name as an int counter,
# clobbering the title dict built above
n_written = 0
with open('data/wiki_final_processed.json', 'w') as writefile:
    with open('data/titles_joined.json', 'r') as readfile:
        for line in readfile:
            d = json.loads(line)
            d['title'] = d['title'].replace(" ", "_")
            if d['title'] in titles_cat:
                n_written += 1
                d['category'] = titles_cat[d['title']]
                writefile.write(json.dumps(d) + '\n')
t = time.time()
print("Wrote categories for " + str(n_written) + " articles in " + str(t-t0) + "s.")

We have a list of articles under each category. Let's also get a list of categories for each article, by storing in a data structure called default dict.

In [1]:
import json
from collections import defaultdict

# Invert the article -> categories mapping: a list of article titles per
# category. (Removed `ids`, `categories` and the counter `n`, which were
# never populated or read.)
cat_titles = defaultdict(list)
with open('data/wiki_final_processed.json', 'r') as readfile:
    for line in readfile:
        d = json.loads(line)
        for category in d['category']:
            cat_titles[category].append(d['title'])

Store categories and article names in a pandas dataframe

We will store this in a pandas dataframe for easy filtering and querying.

In [ ]:
# NOTE(review): `cat_items` is not defined anywhere in this notebook, so this
# cell raises NameError on a fresh run — likely a leftover from a deleted cell;
# `cats_df` below is built from `cat_titles` instead. Confirm and delete or fix.
cats = pd.DataFrame.from_dict(cat_items, orient = 'index')
In [ ]:
# NOTE(review): `stemmer2` is never defined in this notebook (only `stemmer`,
# created further down) — leftover from out-of-order editing; fails on a fresh run.
stemmer2.stem('gardening')
In [91]:
# Category -> article-list table, one row per category.
cats_df = pd.DataFrame(
    [(category, article_list) for category, article_list in cat_titles.items()],
    columns=['category', 'articles'],
)
cats_df.head()
Out[91]:
category articles
0 Climate_forcing [Albedo, Greenhouse_effect, Global_warming_pot...
1 Climatology [Albedo, Drought, Extreme_weather, Weathering,...
2 Electromagnetic_radiation [Albedo, Aberration_of_light, Beer–Lambert_law...
3 Land_surface_effects_on_climate [Albedo, Rain_shadow, Land_surface_effects_on_...
4 Radiometry [Albedo, Crookes_radiometer, Radiometry, Pyrom...

Sort the dataframe based on the number of articles in each category, so that we can focus on categories with the most articles.

In [92]:
# Count articles per category and sort descending. Vectorized .map(len)
# replaces the Python loop over positional iloc[i][1] (fragile: it depended
# on column order) and the sort is reassigned instead of using inplace=True.
cats_df['n_articles'] = cats_df['articles'].map(len)
cats_df = cats_df.sort_values('n_articles', ascending=False, na_position='last')
In [77]:
# Forward-slash path: Python's open() accepts it on Windows too, and the
# original 'data\cats_articles' (literal backslash) breaks on Linux/macOS
# and is inconsistent with the 'data/...' paths used elsewhere here.
cats_df.to_csv('data/cats_articles', index=False, header = True)
#cats_df = pd.read_csv('data/cats_articles')
cats_df.head()
Out[77]:
category articles n_articles
0 Living_people ['Anatoly_Karpov', 'Anita_Hill', 'Dale_Chihuly... 3536
1 American_films ['The_Birth_of_a_Nation', 'King_Kong_(1933_fil... 3438
2 American_black-and-white_films ['The_Birth_of_a_Nation', 'King_Kong_(1933_fil... 3036
3 Enzymes_of_unknown_structure ['L-lactate_dehydrogenase_(cytochrome)', '1,4-... 2087
4 English-language_films ['King_Kong_(1933_film)', 'U-571_(film)', 'Cro... 1964
In [49]:
# Top 300 categories with counts, for picking classifier classes.
# Path fixed to forward slashes for portability (matches the rest of the notebook).
cats_df.iloc[:300, [0,2]].to_csv('data/cats_articles300', index = False)

Tokenize with Spacy and save to file

In [120]:
import spacy
import time
import json
In [2]:
# Load the spaCy English model and build a plain whitespace tokenizer from its vocab.
# NOTE(review): the 'en' shortcut only works on spaCy 1.x/2.x; spaCy 3 removed it —
# use spacy.load('en_core_web_sm') there.
nlp = spacy.load('en')
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)
In [3]:
# Tokenize every article's text with the whitespace tokenizer and rewrite it
# as a single space-joined string; progress is reported every 10k articles.
t0 = time.time()
count = 0

with open('data/wiki_final_processed.json', 'r') as readfile:
    with open('data/wiki_tokenized.json', 'w') as writefile:
        for raw_line in readfile:
            article = json.loads(raw_line)
            tokens = [str(token) for token in tokenizer(article['text'])]
            article['text'] = " ".join(tokens)
            writefile.write(json.dumps(article) + '\n')
            count += 1
            if count%10000 == 0:
                print(article['text'][:30], tokens[:4])
                print("Tokenized", count, "articles")
t= time.time()
print(t-t0)
Lucky Jim is a novel by Kingsl ['Lucky', 'Jim', 'is', 'a']
Tokenized 10000 articles
A hydraulic press is a device  ['A', 'hydraulic', 'press', 'is']
Tokenized 20000 articles
Inertron is a fictional metal  ['Inertron', 'is', 'a', 'fictional']
Tokenized 30000 articles
A focusing screen is a flat tr ['A', 'focusing', 'screen', 'is']
Tokenized 40000 articles
Ray-Ban Wayfarer sunglasses ha ['Ray-Ban', 'Wayfarer', 'sunglasses', 'have']
Tokenized 50000 articles
A gate dielectric is a dielect ['A', 'gate', 'dielectric', 'is']
Tokenized 60000 articles
Lyrick Studios was an American ['Lyrick', 'Studios', 'was', 'an']
Tokenized 70000 articles
In enzymology, an adenosylmeth ['In', 'enzymology,', 'an', 'adenosylmethionine']
Tokenized 80000 articles
Dancing on Coral is a Miles_Fr ['Dancing', 'on', 'Coral', 'is']
Tokenized 90000 articles
Difluorocarbene is the chemica ['Difluorocarbene', 'is', 'the', 'chemical_compound']
Tokenized 100000 articles
6-(2-Aminopropyl)-2,3-dihydrob ['6-(2-Aminopropyl)-2,3-dihydrobenzofuran', '(6-APDB,', '4-Desoxy-MDA,', 'EMA-3)']
Tokenized 110000 articles
Cariporide is a selective Na+/ ['Cariporide', 'is', 'a', 'selective']
Tokenized 120000 articles
Baker-Miller Pink is a tone of ['Baker-Miller', 'Pink', 'is', 'a']
Tokenized 130000 articles
Galloping Bungalows: The Rise  ['Galloping', 'Bungalows:', 'The', 'Rise']
Tokenized 140000 articles
The following is a list of cou ['The', 'following', 'is', 'a']
Tokenized 150000 articles
1296.5032212734222

Stemming with NLTK

In [16]:
# Snowball stemmer for English; ignore_stopwords=True leaves stop words unstemmed.
import nltk
nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Astha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [18]:
# sanity check: 'running' should stem to 'run'
stemmer.stem('running')
Out[18]:
'run'

Data pre-processing for a 5-class chemical-physics classifier

From among the 300 most frequent categories, I have picked 5 categories to train a classifier on. For a single label classifier, let's make sure that the categories don't share a significant number of articles.

In [78]:
# Five hand-picked categories (from the 300 most frequent) for the classifier.
choose = ['Fluid_dynamics', 'Quantum_mechanics', 'Condensed_matter_physics', 'Limestone_caves', 'Acoustics']
In [111]:
# Pairwise article overlap between the chosen categories: a single-label
# classifier needs categories that share few articles. (Removed the dead
# `intersect = []` accumulator — its append was commented out — and the
# commented-out py2 `sets` import.)
for i, cat_1 in enumerate(choose):
    for j, cat_2 in enumerate(choose):
        if i <= j:
            continue
        art_1 = set(cats_df[cats_df['category'] == cat_1]['articles'].tolist()[0])
        art_2 = set(cats_df[cats_df['category'] == cat_2]['articles'].tolist()[0])
        print(cat_1, cat_2, len(list(art_1.intersection(art_2))))
    
Quantum_mechanics Fluid_dynamics 0
Condensed_matter_physics Fluid_dynamics 2
Condensed_matter_physics Quantum_mechanics 17
Limestone_caves Fluid_dynamics 0
Limestone_caves Quantum_mechanics 0
Limestone_caves Condensed_matter_physics 0
Acoustics Fluid_dynamics 8
Acoustics Quantum_mechanics 0
Acoustics Condensed_matter_physics 0
Acoustics Limestone_caves 0

Some categories share a few articles, but fewer than 10% are common. Notably:

  1. Condensed_matter_physics and Quantum_mechanics have the highest # of articles in common.
  2. Limestone_caves is the most separate category from the remaining 4.

Let's write all the tokenized text for this selected set of articles to a file.

In [116]:
# Invert: article title -> list of chosen categories it belongs to.
arts = defaultdict(list)
for cat in choose:
    cat_articles = cats_df[cats_df['category'] == cat]['articles'].tolist()[0]
    for title in cat_articles:
        arts[title].append(cat)
In [108]:
# spot-check: list the articles of the first chosen category
cats_df[cats_df['category']==choose[0]]['articles'].tolist()[0]
Out[108]:
['Acoustic_theory',
 'Cavitation',
 'Fluid',
 'Fluid_dynamics',
 'Jet_engine',
 'Microfluidics',
 'Mach_number',
 'Pressure',
 'Superfluid_helium-4',
 'Drop_(liquid)',
 'Viscometer',
 'Power_number',
 'Soliton',
 'Strouhal_number',
 'Péclet_number',
 'Aerosol',
 'Prandtl_number',
 'Nusselt_number',
 'Grashof_number',
 'Potential_flow',
 'Rayleigh_number',
 "Bernoulli's_principle",
 'Non-Newtonian_fluid',
 'Deborah_number',
 'Supercavitation',
 'Surface_tension',
 'Speed_of_sound',
 'Turbulence',
 'Vortex',
 'Gravity_wave',
 'Mean_flow',
 'Atmospheric_sciences',
 'Vorticity',
 'Compressibility',
 'Baroclinity',
 'Drag_coefficient',
 'Magnetohydrodynamic_drive',
 "Stokes'_law",
 'Stream_function',
 'Capillary_action',
 'Flow_measurement',
 'Wake',
 'Adverse_pressure_gradient',
 'Hydraulic_fluid',
 'Weissenberg_number',
 'Pitch_drop_experiment',
 'Terminal_velocity',
 'Wave_drag',
 'Superheating',
 'Hydraulic_jump',
 'Inflatable',
 'Magnus_effect',
 "Archimedes'_principle",
 'Soundproofing',
 'Laplace_number',
 'Damköhler_numbers',
 'Archimedes_number',
 'Electro-osmosis',
 'Newtonian_fluid',
 'Parasitic_drag',
 'Soap_bubble',
 'Richardson_number',
 'Boussinesq_approximation_(buoyancy)',
 'Froude_number',
 'Gravity_current',
 'Shower-curtain_effect',
 'Knudsen_number',
 'Mach_wave',
 'Volumetric_flow_rate',
 'Solenoidal_vector_field',
 'Siphon',
 "D'Alembert's_paradox",
 'Fluid_parcel',
 'Ekman_number',
 "Helmholtz's_theorems",
 'Antibubble',
 'Blade_element_theory',
 'Geostrophic_wind',
 'Ventilation_(architecture)',
 'Magnetorheological_fluid',
 'Tidal_bore',
 'Kelvin–Helmholtz_instability',
 'Hydraulic_diameter',
 'Venturi_effect',
 'Rotameter',
 'Reynolds-averaged_Navier–Stokes_equations',
 'Hydraulic_accumulator',
 'Two-phase_flow',
 'Material_derivative',
 'Capillary_wave',
 'Homentropic_flow',
 'Rogue_wave',
 'Circulation_(fluid_dynamics)',
 'Ocean_gyre',
 'Smart_fluid',
 'Atmospheric_physics',
 'Thixotropy',
 'Dilatant',
 'Pressure_coefficient',
 'Internal_wave',
 'Riabouchinsky_solid',
 'Fluidics',
 'Supercritical_flow',
 'Taylor–Proudman_theorem',
 'Particle_image_velocimetry',
 'Hydrodynamical_helicity',
 'Injector',
 'Large_eddy_simulation',
 'Pressure-gradient_force',
 'Convective_available_potential_energy',
 'Splash_(fluid_mechanics)',
 'Electrohydrodynamics',
 'P-wave',
 'Hydraulic_machinery',
 'Hull_speed',
 'Slime',
 'Impeller',
 'Vortex_shedding',
 'Barotropic_fluid',
 'Faraday_wave',
 'Velocimetry',
 'Hydrant',
 'Ekman_spiral',
 'Bulbous_bow',
 'Bow_wave',
 'Traffic_wave',
 'Dispersion_(water_waves)',
 'Rayleigh–Taylor_instability',
 'Momentum_theory',
 'Thermal_hydraulics',
 'Thermal_loop',
 'Turbidity_current',
 'Pneumatic_motor',
 'Complex_fluid',
 'Orifice_plate',
 'Hydraulic_head',
 'Graetz_number',
 'Ram_pressure',
 'Plug_flow',
 'Hydroxyl_tagging_velocimetry',
 'Weber_number',
 'Stokes_flow',
 'Mach_tuck',
 'Drag_(physics)',
 'Sauter_mean_diameter',
 'Reyn',
 'History_of_fluid_mechanics',
 'Stagnation_temperature',
 'Stagnation_point',
 'Mass_flow_rate',
 'Sonication',
 'Manning_formula',
 'Open-channel_flow',
 'Foil_(fluid_mechanics)',
 'Rheometer',
 'Taylor_number',
 'Taylor–Couette_flow',
 'Aeroacoustics',
 'Roughness_length',
 'Enstrophy',
 'Brunt–Väisälä_frequency',
 'Fanning_friction_factor',
 'Momentum_diffusion',
 'Drag_count',
 'Fluid_pipe',
 'Natural_circulation',
 'Oblique_shock',
 'Rayleigh_flow',
 'Isothermal_flow',
 'Metafluid_dynamics',
 'Stagnation_pressure',
 'Added_mass',
 'Lewis_number',
 'Schmidt_number',
 'Hamiltonian_fluid_mechanics',
 'Stokes_radius',
 'Agitator_(device)',
 'Mass_flow',
 'Sonochemistry',
 'Magnetic_Reynolds_number',
 'Hagen–Poiseuille_flow_from_the_Navier–Stokes_equations',
 'Shear_thinning',
 'Plume_(fluid_dynamics)',
 'Pneumatic_gripper',
 'Entrainment_(hydrodynamics)',
 "Mariotte's_bottle",
 'Reynolds_transport_theorem',
 'Upper-convected_time_derivative',
 'Plastometer',
 'Pneumatic_cylinder',
 'Rankine_body',
 'Ludwieg_tube',
 'Hypersonic_wind_tunnel',
 'Supersonic_wind_tunnel',
 'Stanton_number',
 'Euler_number_(physics)',
 'Direct_numerical_simulation',
 "Torricelli's_law",
 'Molecular_tagging_velocimetry',
 'No-slip_condition',
 'Static_pressure',
 'Capillary_pressure',
 'Boiler_feedwater_pump',
 'Max_Q',
 'Float_switch',
 'Atkinson_friction_factor',
 'Kutta_condition',
 'Eddy_(fluid_dynamics)',
 'Atkinson_resistance',
 'Blasius_boundary_layer',
 'Boojum_(superfluidity)',
 'Gas_kinetics',
 'Particle_tracking_velocimetry',
 'Galilei_number',
 'Kaye_effect',
 'Rheoscopic_fluid',
 'Flow_separation',
 'Law_of_the_wall',
 'Wetted_perimeter',
 'Kitchen_rudder',
 'Pleuger_rudder',
 'Flow_visualization',
 'Shear_flow',
 'Perrin_friction_factors',
 'Intrinsic_viscosity',
 'Vacuum_coffee_maker',
 'Beta_plane',
 'Lagrangian_and_Eulerian_specification_of_the_flow_field',
 'Monin–Obukhov_length',
 'Ekman_transport',
 'Specific_speed',
 'MUSCL_scheme',
 'Ohnesorge_number',
 'Inviscid_flow',
 'Dynamic_pressure',
 'Navier–Stokes_existence_and_smoothness',
 'Fluid–structure_interaction',
 'Womersley_number',
 'Blast_wave',
 'Rotating_tank',
 'Seeding_(fluid_dynamics)',
 'Airlift_pump',
 'Metering_pump',
 'Kalliroscope',
 'Prandtl–Meyer_expansion_fan',
 'Wave-making_resistance',
 "Kelvin's_circulation_theorem",
 'Friction_loss',
 'Pressure_head',
 'Richtmyer–Meshkov_instability',
 'Plateau–Rayleigh_instability',
 'Pipe_network_analysis',
 'Roshko_number',
 'Natural_ventilation',
 'Flow_coefficient',
 'Stagnation_enthalpy',
 'Streamline_diffusion',
 'Pneumatic_barrier',
 'Inspirator',
 'Moving_shock',
 'Kutta–Joukowski_theorem',
 'Jet_(fluid)',
 'Relative_permeability',
 'Young–Laplace_equation',
 'Prandtl–Meyer_function',
 'Marangoni_number',
 'Shock_(fluid_dynamics)',
 'Planar_laser-induced_fluorescence',
 'Ledinegg_instability',
 'Froude–Krylov_force',
 'Stream_thrust_averaging',
 'Riemann_problem',
 'Flettner_rotor',
 'Cunningham_correction_factor',
 'Drag_crisis',
 'Compressed_fluid',
 'Lubrication_theory',
 'Papkovich–Neuber_solution',
 'Orr–Sommerfeld_equation',
 'Synthetic_schlieren',
 'Nanofluidics',
 'Slosh_dynamics',
 'Laplace_formula',
 'Lubricity',
 'Fluid_Dynamics_Prize_(APS)',
 'Total_pressure',
 'Flow_tracer',
 'Theory_of_tides',
 'Baldwin–Lomax_model',
 'Cebeci–Smith_model',
 'Mach_reflection',
 'Film_temperature',
 'Jet_noise',
 'Turbulent_Prandtl_number',
 'Airflow_Sciences_Corporation',
 'Flow_velocity',
 'Magnetorotational_instability',
 'Capillary_length',
 'Dean_number',
 'Taylor_dispersion',
 'Pressure-correction_method',
 'Slender-body_theory',
 'Positive_displacement_meter',
 'Base_conditions',
 'Flow_conditions',
 'Film-forming_agent',
 'Hydroelasticity',
 'Viscimetry',
 'Küssner_effect',
 'Trisonic_Wind_Tunnel_(El_Segundo,_California)',
 'Synthetic_jet',
 'Sod_shock_tube',
 "Stokes'_law_of_sound_attenuation",
 'Volume_viscosity',
 'Free_molecular_flow',
 'Extensional_viscosity',
 'OpenFOAM',
 'End_correction',
 'Capillary_surface',
 'HydroEngine',
 'Surface_force',
 'Background-oriented_schlieren_technique',
 'Gladstone–Dale_relation',
 'Maximum_allowable_operating_pressure',
 'Secondary_flow',
 'Boussinesq_approximation_(water_waves)',
 'Natural_convection',
 'Impact_pressure',
 'Atwood_number',
 'Stokes_drift',
 'Overflow_(software)',
 'Stokes_wave',
 'Wind_stress',
 'List_of_hydrodynamic_instabilities_named_after_people',
 'List_of_waves_named_after_people',
 'Taylor–Green_vortex',
 'List_of_fluid_flows_named_after_people',
 'Görtler_vortices',
 'Vortex_breaker',
 'Barber–Layden–Power_effect',
 'Magnetic_Prandtl_number',
 'Squat_effect',
 'Vortex_lattice_method',
 'Hele-Shaw_flow',
 'Pipe_flow',
 'Moody_chart',
 'Air_classifier',
 'Aerodynamic_potential-flow_code',
 'Lagrangian_analysis',
 'Jiggle_syphon',
 'Coolfluid',
 'Chézy_formula',
 'Stokes_stream_function',
 'Stokes_boundary_layer',
 'Annular_velocity',
 'Apparent_viscosity',
 'Keulegan–Carpenter_number',
 'Sommerfeld_number',
 'Ford_viscosity_cup',
 'Rouse_number',
 'Herschel–Bulkley_fluid',
 'Tollmien–Schlichting_wave',
 'Basset_force',
 "Luke's_variational_principle",
 'Atomizer_nozzle',
 'Emerson_Cavitation_Tunnel',
 'Ursell_number',
 'Hadamard–Rybczynski_equation',
 'Oseen_equations',
 'Pumplinx',
 'SIC_Processing',
 'Laplace_pressure',
 'Borda–Carnot_equation',
 'Depth–slope_product',
 'Biorheology',
 'Knudsen_layer',
 'Capillary_condensation',
 'Knudsen_flow',
 'Volumetric_flux',
 'Bridge_scour',
 'Coriolis–Stokes_force',
 'Morison_equation',
 'Hydrodynamic_stability',
 'River_mouth',
 'Otto_Laporte_Award',
 'Turbulent_diffusion',
 'Cnoidal_wave',
 'Vortex_stretching',
 'Taylor_microscale',
 'Double_diffusive_convection',
 'Volute_(pump)',
 'Venturi_flume',
 'Laminar-turbulent_transition',
 'Reynolds_number',
 'Energy_Manufacturing_Co._Inc',
 'Potential_flow_around_a_circular_cylinder',
 'Shock_polar',
 'Rushton_turbine',
 'Food_rheology',
 'Photon_bubble',
 'Current_(fluid)',
 'Batchelor_scale',
 'Nanofluidic_circuitry',
 'Lagrangian_coherent_structure',
 'Supersonic_airfoils',
 'Hartmann_number',
 'Shields_parameter',
 'Discharge_coefficient',
 'Thermal_bar',
 'Markstein_number',
 'Patch_dynamics_(physics)',
 'Epicyclic_frequency',
 'Gravity_feed',
 'Infragravity_wave',
 'Scallop_theorem',
 'Capillary_action_through_synthetic_mesh',
 'Similarity_solution',
 'Contour_advection',
 'Chaotic_mixing',
 'Trajectory_(fluid_mechanics)',
 'Flow_focusing',
 'Dynamic_fluid_film_equations',
 'Electrostatic_fluid_accelerator',
 'Specific_fan_power',
 'Milne-Thomson_circle_theorem',
 'Geophysical_fluid_dynamics',
 'Taylor–Goldstein_equation',
 'Reynolds_operator',
 'Dimensionless_Specific_Energy_Diagrams_for_Open_Channel_Flow',
 'Float_(liquid_level)',
 'An_Album_of_Fluid_Motion',
 'Optofluidics',
 "Jurin's_law",
 'Blake_number',
 'Pulsatile_flow',
 'Cell-free_marginal_layer_model',
 'Atomization_and_Sprays',
 'Dynamic_similarity_(Reynolds_and_Womersley_numbers)',
 'Flow_conditioning',
 "Euler's_pump_and_turbine_equation",
 'Bernoulli_grip',
 'Particle-laden_flows',
 'Bauer-Nilsen',
 "Crocco's_theorem",
 'Standard_litre_per_minute',
 'Peregrine_soliton',
 'Slip_ratio_(gas–liquid_flow)',
 'Image-based_flow_visualization',
 'Streamlet_(scientific_visualization)',
 'Flow_waveform',
 'Hydrometeor_loading',
 'Momentum-depth_relationship_in_a_rectangular_channel',
 'Kinematic_wave',
 "Bagnold's_fluid",
 'Center_of_lateral_resistance',
 'Knudsen_equation',
 'Stuart_number',
 'SESAM_(FEM)',
 'Bell_mouth',
 'Air-mixing_plenum',
 'Most_efficient_section',
 'Characteristic_number_(fluid_dynamics)',
 'Q-Vectors',
 'Lorentz_force_velocimetry',
 'Superfluidity',
 'Flow_control_(fluid)',
 'Segré–Silberberg_effect',
 'Helmholtz_flow',
 'Vortex_sheet',
 'Peniche_(fluid_dynamics)',
 'Langmuir_Turbulence',
 'Immersion_chiller',
 'SRM_Engine_Suite',
 'Blade_element_momentum_theory',
 'Capillary_flow_porometry',
 'Reduced_frequency',
 "Ladyzhenskaya's_inequality",
 'Flow_distribution_in_manifolds',
 'Rotational_viscosity',
 'Palinstrophy',
 'Topological_fluid_dynamics',
 'Gravity_Current_Intrusion',
 "Stokes'_paradox",
 'Sandia_method',
 'Different_types_of_boundary_conditions_in_fluid_dynamics',
 'Darwin_drift',
 'Electrodynamic_droplet_deformation',
 'Constant_viscosity_elastic_fluid',
 'Elasto-capillarity',
 'Drop_impact',
 'Flotation_of_flexible_objects',
 'Recoil_(fluid_behavior)',
 'Burgers_vortex',
 'Discretization_of_Navier–Stokes_equations',
 'Kapitza_number',
 'Slug_flow',
 'Two-dimensional_flow',
 'Compressible_duct_flow',
 'Prandtl_condition',
 'Entrance_length',
 'Liquid_droplet_radiator',
 'Supersonic_flow_over_a_flat_plate',
 'Induced-charge_electrokinetics',
 'Stratified_flow',
 'Optoelectrowetting',
 'Stokes_approximation_and_artificial_time',
 'Skin_friction_drag',
 'SCCM_(flow_unit)',
 'Squirmer',
 'Multiscale_turbulence',
 'Mud_cake_(oil_and_gas)',
 'Darrieus–Landau_instability',
 'Modified_pressure',
 'Dimensionless_numbers_in_fluid_mechanics',
 'List_of_optofluidics_researchers',
 'Advanced_Simulation_Library',
 'Flow_cups',
 'Laser_schlieren_deflectometry',
 'Rivlin–Ericksen_tensor',
 'Astrophysical_fluid_dynamics',
 'Arnold–Beltrami–Childress_flow',
 'Pressure-driven_flow',
 'Leray_projection',
 "Squire's_theorem",
 'Rise_in_Core',
 'Invasion_percolation',
 'Knudsen_paradox',
 'Eulerian_coherent_structure',
 'R_J_Mitchell_Wind_Tunnel']
In [119]:
import json

# Keep only the tokenized articles belonging to the five chosen categories.
# Paths use forward slashes: 'data\wiki_tokenized.json' (literal backslash)
# was inconsistent with the 'data/wiki_tokenized.json' path the file was
# written with above, and breaks on Linux/macOS.
written = 0
with open('data/wiki_tokenized.json', 'r') as readfile:
    with open('data/physchemx.json', 'w') as writefile:
        for line in readfile:
            d = json.loads(line)
            if d['title'] in arts:
                written += 1
                d['category'] = arts[d['title']]
                writefile.write(json.dumps(d) + '\n')
print("written", written, "articles")
written 1920 articles

5-class single-label classifier

In [127]:
import pandas as pd
# forward-slash path: portable and consistent with the rest of the notebook
data = pd.read_json('data/physchemx.json', lines = True)
data.head()
Out[127]:
category id section text title
0 [Acoustics] 1198 Abstract,History,History,History,History,Histo... Acoustics is the interdisciplinary science tha... Acoustics
1 [Fluid_dynamics, Acoustics] 1234 Abstract,Derivation of the governing equations... Acoustic theory is a scientific field that rel... Acoustic_theory
2 [Condensed_matter_physics] 4474 Abstract,History,History,History,History,Criti... A Bose–Einstein condensate (BEC) is a state_of... Bose–Einstein_condensate
3 [Quantum_mechanics] 4542 Abstract,Usage,Usage,Usage,Usage,Usage,Usage,V... In quantum_mechanics, bra–ket notation is a st... Bra–ket_notation
4 [Condensed_matter_physics] 5346 Abstract,Classification,Classification,Classif... A colloid, in chemistry, is a mixture in which... Colloid
In [128]:
titles = data['title']
X_raw = data['text']
y_raw = data['category']
# keep only the first category when an article belongs to several
# (comprehension variable renamed so it no longer shadows the result name y)
y = [cat_list[0] for cat_list in y_raw]

The function text_process removes punctuation and stop-words. This will be passed to the count-vectorizer from sklearn.

In [129]:
import string
from spacy.lang.en.stop_words import STOP_WORDS

# lookup dicts used as sets for O(1) membership tests
punc_dict = dict((s,1) for s in string.punctuation)
stop_dict = dict((c,1) for c in STOP_WORDS)
stop_dict['\n\n'] = 1
stop_dict['\n'] = 1
# keep underscores: they glue linked phrases into single n-gram tokens
punc_dict.pop('_', None)

def text_process(text, punc_dict= punc_dict, stop_dict=stop_dict):
    """Lowercase `text`, strip punctuation (except '_'), and drop stop words.

    Returns a list of the remaining space-separated tokens.
    """
    text = text.lower()
    # build the stripped string with a single join: the original appended one
    # character at a time, which is quadratic on long articles
    nopunc = "".join(char for char in text if char not in punc_dict)
    return [word for word in nopunc.split(" ") if word not in stop_dict]

Let's see what it does. It creates a list of words — tokens — for each text that is passed to it.

In [131]:
# Demo: tokenize the second article ('Acoustic_theory' per the table above).
text_process(X_raw[1])
Out[131]:
['acoustic',
 'theory',
 'scientific',
 'field',
 'relates',
 'description',
 'soundlongitudinal_and_transverse_waves',
 'derives',
 'fluid_dynamics',
 'acoustics',
 'engineering',
 'approach',
 'propagation',
 'sound',
 'waves',
 'fluid',
 'water',
 'modeled',
 'equation',
 'continuity',
 'conservation',
 'mass',
 'equation',
 'motion',
 'conservation',
 'momentum',
 '',
 'simplifications',
 'particular',
 'constant',
 'density',
 'given',
 'follows',
 '',
 'formula_1',
 'formula_2',
 'acoustic',
 'pressure',
 'formula_3',
 'flow_velocity',
 'vector',
 'formula_4',
 'vector',
 'spatial',
 'coordinates',
 'formula_5',
 'formula_6',
 'time',
 'formula_7',
 'static',
 'mass',
 'density',
 'medium',
 'formula_8',
 'bulk_modulus',
 'medium',
 'bulk',
 'modulus',
 'expressed',
 'terms',
 'density',
 'speed',
 'sound',
 'medium',
 'formula_9',
 '',
 'formula_10',
 'flow',
 'velocity',
 'field',
 'irrotational',
 'formula_11',
 'acoustic_wave_equation',
 'combination',
 'sets',
 'balance',
 'equations',
 'expressed',
 'formula_12',
 'vector_laplacian',
 'formula_13',
 'acoustic',
 'wave',
 'equation',
 'mass',
 'momentum',
 'balance',
 'equations',
 'expressed',
 'terms',
 'scalar_potential',
 'formula_14',
 'formula_15',
 'case',
 'acoustic',
 'wave',
 'equation',
 'written',
 'formula_16',
 'momentum',
 'balance',
 'mass',
 'balance',
 'expressed',
 'formula_17',
 'derivations',
 'equations',
 'waves',
 'acoustic',
 'medium',
 'given',
 'equations',
 'derivation_of_the_navier–stokes_equationsgeneral_form_of_the_navier–stokes_equations',
 'fluid',
 'medium',
 'formula_18',
 'formula_19',
 'body',
 'force',
 'unit',
 'mass',
 'formula_20',
 'pressure',
 'formula_21',
 'cauchy_stress_tensorstress_deviator_tensor',
 'formula_22',
 'cauchy_stress_tensor',
 'formula_23',
 'formula_24',
 'rank2',
 'identity',
 'tensor',
 'assumptions',
 'derive',
 'momentum',
 'balance',
 'equation',
 'acoustic',
 'medium',
 'assumptions',
 'resulting',
 'forms',
 'momentum',
 'equations',
 'outlined',
 'acoustics',
 'fluid',
 'medium',
 'assumed',
 'newtonian_fluid',
 'newtonian',
 'fluid',
 'deviatoric',
 'stress',
 'tensor',
 'related',
 'flow',
 'velocity',
 'formula_25',
 'formula_26',
 'shear',
 'viscosity',
 'formula_27',
 'bulk_viscosity',
 'divergence',
 'formula_21',
 'given',
 'formula_29',
 'identity',
 'formula_30',
 'formula_31',
 'equations',
 'conservation',
 'momentum',
 'written',
 'formula_32',
 'acoustics',
 'problems',
 'assume',
 'flow',
 'irrotational',
 'vorticity',
 'zero',
 'case',
 'formula_33',
 'momentum',
 'equation',
 'reduces',
 'formula_34',
 'frequently',
 'assumption',
 'effect',
 'body',
 'forces',
 'fluid',
 'medium',
 'negligible',
 'momentum',
 'equation',
 'simplifies',
 'formula_35',
 'additionally',
 'assume',
 'viscous',
 'forces',
 'medium',
 'bulk',
 'shear',
 'viscosities',
 'zero',
 'momentum',
 'equation',
 'takes',
 'form',
 'formula_36',
 'important',
 'simplifying',
 'assumption',
 'acoustic',
 'waves',
 'amplitude',
 'disturbance',
 'field',
 'quantities',
 'small',
 'assumption',
 'leads',
 'linear',
 'small',
 'signal',
 'acoustic',
 'wave',
 'equation',
 'express',
 'variables',
 'sum',
 'time',
 'averaged',
 'mean',
 'field',
 'formula_37',
 'varies',
 'space',
 'small',
 'fluctuating',
 'field',
 'formula_38',
 'varies',
 'space',
 'time',
 'formula_39',
 'formula_40',
 'momentum',
 'equation',
 'expressed',
 'formula_41',
 'fluctuations',
 'assumed',
 'small',
 'products',
 'fluctuation',
 'terms',
 'neglected',
 'order',
 'formula_42',
 'assume',
 'medium',
 'homogeneous',
 'sense',
 'time',
 'averaged',
 'variables',
 'formula_43',
 'formula_44',
 'zero',
 'gradients',
 'ie',
 'formula_45',
 'momentum',
 'equation',
 'formula_46',
 'stage',
 'assume',
 'medium',
 'rest',
 'implies',
 'mean',
 'flow',
 'velocity',
 'zero',
 'ie',
 'formula_47',
 'balance',
 'momentum',
 'reduces',
 'formula_48',
 'dropping',
 'tildes',
 'formula_49',
 'commonly',
 'form',
 'acoustic',
 'momentum',
 'equation',
 'formula_50',
 'equation',
 'derivation_of_the_navier–stokes_equationsconservation_of_mass',
 'fluid',
 'volume',
 'mass',
 'sources',
 'sinks',
 'given',
 'formula_51',
 'formula_52',
 'mass',
 'density',
 'fluid',
 'formula_53',
 'flow',
 'velocity',
 'equation',
 'conservation',
 'mass',
 'acoustic',
 'medium',
 'derived',
 'manner',
 'similar',
 'conservation',
 'momentum',
 'assumption',
 'small',
 'disturbances',
 'formula_39',
 'formula_40',
 'mass',
 'balance',
 'equation',
 'written',
 'formula_56',
 'neglect',
 'higher',
 'order',
 'terms',
 'fluctuations',
 'mass',
 'balance',
 'equation',
 'formula_57',
 'assume',
 'medium',
 'homogeneous',
 'ie',
 'formula_58',
 'mass',
 'balance',
 'equation',
 'takes',
 'form',
 'formula_59',
 'stage',
 'assume',
 'medium',
 'rest',
 'ie',
 'formula_47',
 'mass',
 'balance',
 'equation',
 'expressed',
 'formula_61',
 'close',
 'system',
 'equations',
 'need',
 'equation_of_state',
 'pressure',
 'assume',
 'medium',
 'ideal_gas',
 'acoustic',
 'waves',
 'compress',
 'medium',
 'adiabatic',
 'reversible_process_thermodynamics',
 'manner',
 'equation',
 'state',
 'expressed',
 'form',
 'differential',
 'equation',
 'formula_62',
 'formula_63',
 'specific_heat',
 'constant',
 'pressure',
 'formula_64',
 'specific_heat',
 'constant',
 'volume',
 'formula_65',
 'wave',
 'speed',
 'value',
 'formula_66',
 '14',
 'acoustic',
 'medium',
 'air',
 'small',
 'disturbances',
 'formula_67',
 'formula_9',
 'speed_of_sound',
 'medium',
 'formula_69',
 'balance',
 'mass',
 'written',
 'formula_70',
 'dropping',
 'tildes',
 'defining',
 'formula_71',
 'gives',
 'commonly',
 'expression',
 'balance',
 'mass',
 'acoustic',
 'medium',
 'formula_72',
 'use',
 'cylindrical_coordinate_system',
 'formula_73',
 'basis_vectors',
 'formula_74',
 'gradient',
 'formula_20',
 'divergence',
 'formula_76',
 'given',
 'formula_77',
 'flow_velocity',
 'expressed',
 'formula_78',
 'equations',
 'conservation_of_momentum',
 'written',
 'formula_79',
 'terms',
 'components',
 'equations',
 'conservation_of_momentum',
 'cylindrical_coordinates',
 'formula_80',
 'equation',
 'conservation_of_mass',
 'similarly',
 'written',
 'cylindrical_coordinates',
 'formula_81',
 'acoustic',
 'equations',
 'conservation_of_momentum',
 'conservation_of_mass',
 'expressed',
 'time',
 'harmonic',
 'form',
 'fixed',
 'frequency',
 'case',
 'pressures',
 'flow',
 'velocity',
 'assumed',
 'time',
 'harmonic',
 'functions',
 'form',
 'formula_82',
 'formula_83',
 'frequency',
 'substitution',
 'expressions',
 'governing',
 'equations',
 'cylindrical_coordinates',
 'gives',
 'fixed',
 'frequency',
 'form',
 'conservation_of_momentum',
 'formula_84',
 'fixed',
 'frequency',
 'form',
 'conservation_of_mass',
 'formula_85',
 'special',
 'case',
 'field',
 'quantities',
 'independent',
 'zcoordinate',
 'eliminate',
 'formula_86',
 'formula_87',
 'assuming',
 'solution',
 'equation',
 'written',
 'formula_88',
 'write',
 'partial',
 'differential',
 'equation',
 'formula_89',
 'left',
 'hand',
 'function',
 'formula_90',
 'right',
 'hand',
 'function',
 'formula_91',
 'formula_92',
 'formula_93',
 'constant',
 'substitution',
 'formula_94',
 'formula_95',
 'equation',
 'left',
 'bessel_equation',
 'general',
 'solution',
 'formula_96',
 'formula_97',
 'cylindrical',
 'bessel_function',
 'kind',
 'formula_98',
 'undetermined',
 'constants',
 'equation',
 'right',
 'general',
 'solution',
 'formula_99',
 'formula_100',
 'undetermined',
 'constants',
 'solution',
 'acoustic',
 'wave',
 'equation',
 'formula_101',
 'boundary',
 'conditions',
 'needed',
 'stage',
 'determine',
 'formula_102',
 'undetermined',
 'constants']

Split data into test and train sets

In [132]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the articles for evaluation; the fixed seed keeps
# the split reproducible across runs.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.3, random_state=101
)

Vectorize using sklearn

In [134]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer that delegates tokenization to our own
# text_process function; fitting on the full corpus fixes the vocabulary.
vectorizer = CountVectorizer(analyzer=text_process)
vector_fit = vectorizer.fit(X_raw)
In [135]:
# Project each split onto the vocabulary learned from the full corpus.
X = vector_fit.transform(X_raw)
X_test = vector_fit.transform(X_test_raw)
X_train = vector_fit.transform(X_train_raw)

Check the sparsity

In [136]:
# Fraction of non-zero entries, as a percentage of the full matrix size.
n_docs, n_terms = X.shape
density = X.nnz / n_docs / n_terms * 100
print(density)
0.37775081116238585

Naive Bayes classifier

In [139]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes is a strong baseline for word-count features.
model = MultinomialNB()
fit = model.fit(X_train, y_train)
# fit() returns the estimator itself, so `fit` is the trained model.
predict = fit.predict(X_test)
In [140]:
from sklearn.metrics import confusion_matrix, classification_report

# Rows of the confusion matrix are true classes, columns are predictions.
cm = confusion_matrix(y_test, predict)
report = classification_report(y_test, predict)
print(cm)
print(report)
[[ 67   2   7   2   0]
 [  1  85   3   0  15]
 [  1   4 158   1   1]
 [  0   0   0  83   0]
 [  0  20   0   1 125]]
                          precision    recall  f1-score   support

               Acoustics       0.97      0.86      0.91        78
Condensed_matter_physics       0.77      0.82      0.79       104
          Fluid_dynamics       0.94      0.96      0.95       165
         Limestone_caves       0.95      1.00      0.98        83
       Quantum_mechanics       0.89      0.86      0.87       146

             avg / total       0.90      0.90      0.90       576

A simple Naive Bayes classifier is able to do pretty well at identifying the correct category for each article. The precision is very high for classes with no overlaps, while quantum mechanics and condensed matter physics have relatively lower precision. This is in fact related to the noisiness of the data itself: Condensed_matter_physics and Quantum_mechanics share 17 articles in common, but we forced only one of those categories onto each of those articles — this introduced noise into the training data itself.

Broader classification

Problem description

Let's look at the list of top 100 categories with the most articles. Recall that this list of articles was obtained by traversing the wikipedia graph starting at the categories 'materials' and traversing up to depth 5, with the motive of getting a broad range of articles about materials science and related topics such as chemistry, engineering and life-science.

However, looking at the top 100 categories,

  1. A huge number of them seem to have nothing to do with materials science or related topics! For example, 'Living_people' or 'American_films'.
  2. A lot of articles belong to very narrow categories such as 'phenol_ethers' or 'transcription_factors'. For future categorization, we would like to use a broader set of categories.

Thus the problem statement is to do an unsupervised clustering of these categories.

In [143]:
import pandas as pd
cats_df = pd.read_csv('data\cats_articles')
cats_df.head()
Out[143]:
category articles n_articles
0 Living_people ['Anatoly_Karpov', 'Anita_Hill', 'Dale_Chihuly... 3536
1 American_films ['The_Birth_of_a_Nation', 'King_Kong_(1933_fil... 3438
2 American_black-and-white_films ['The_Birth_of_a_Nation', 'King_Kong_(1933_fil... 3036
3 Enzymes_of_unknown_structure ['L-lactate_dehydrogenase_(cytochrome)', '1,4-... 2087
4 English-language_films ['King_Kong_(1933_film)', 'U-571_(film)', 'Cro... 1964
In [144]:
# Inspect the 100 largest categories by article count (frame is sorted
# by n_articles descending, so the first 100 rows are the top 100).
cats_df.head(100)
Out[144]:
category articles n_articles
0 Living_people ['Anatoly_Karpov', 'Anita_Hill', 'Dale_Chihuly... 3536
1 American_films ['The_Birth_of_a_Nation', 'King_Kong_(1933_fil... 3438
2 American_black-and-white_films ['The_Birth_of_a_Nation', 'King_Kong_(1933_fil... 3036
3 Enzymes_of_unknown_structure ['L-lactate_dehydrogenase_(cytochrome)', '1,4-... 2087
4 English-language_films ['King_Kong_(1933_film)', 'U-571_(film)', 'Cro... 1964
5 Debut_novels ['Death_of_a_Hero', 'Neuromancer', 'Sense_and_... 1339
6 Articles_containing_video_clips ['Apollo_11', 'Alkali_metal', 'Atomic_orbital'... 1125
7 Films_made_before_the_MPAA_Production_Code ['King_Kong_(1933_film)', 'It_Happened_One_Nig... 1101
8 HarperCollins_books ['Microserfs', 'Coraline', 'The_Sweet_Hereafte... 1073
9 American_drama_films ['The_Birth_of_a_Nation', 'The_Lost_Weekend_(f... 1010
10 Enzymes_of_known_structure ['Beta-lactamase', 'Beta-galactosidase', 'Coen... 926
11 Phenol_ethers ['Heroin', 'Hydrocodone', 'Quinine', 'Sildenaf... 898
12 Chloroarenes ['Ketamine', 'Clozapine', 'Polychlorinated_bip... 737
13 Transcription_factors ['Transcription_factor', 'Homeobox', 'FOX_prot... 726
14 Articles_created_via_the_Article_Wizard ['Brookite', 'Circumstellar_habitable_zone', '... 692
15 English-language_books ["Darwin's_Dangerous_Idea", 'On_the_Origin_of_... 682
16 Proteins ['Histone', 'Prion', 'Protein', 'Proteome', 'P... 663
17 Alcohols ['Alcohol', 'Erythromycin', 'Hydroxy_group', '... 661
18 Film_noir ['Film_noir', 'Crossfire_(film)', 'The_Lost_We... 655
19 American_science_fiction_novels ['Neuromancer', 'The_Three_Stigmata_of_Palmer_... 654
20 Doubleday_(publisher)_books ['The_Three_Stigmata_of_Palmer_Eldritch', 'A_S... 651
21 Deaths_by_drowning ['Harold_Holt', 'Le_Corbusier', 'Pavel_Urysohn... 621
22 Medicinal_plants ['Acacia_sensu_lato', 'Nepeta', 'Celery', 'Dil... 618
23 Organofluorides ['Trifluoperazine', 'Halothane', 'Haloperidol'... 561
24 American_silent_short_films ['A_Corner_in_Wheat', 'Plane_Crazy', 'Presiden... 557
25 Multilingual_films ['One_Hour_with_You', 'The_Big_House_(1930_fil... 549
26 Plants_described_in_1753 ['Almond', 'Chives', 'Dill', 'Marrubium_vulgar... 533
27 Tor_Books_books ['A_Fire_Upon_the_Deep', 'Speaker_for_the_Dead... 532
28 Fluid_dynamics ['Acoustic_theory', 'Cavitation', 'Fluid', 'Fl... 529
29 American_novels_adapted_into_films ['Fahrenheit_451', 'Gone_with_the_Wind_(novel)... 512
... ... ... ...
70 1932_films ['Grand_Hotel_(1932_film)', 'One_Hour_with_You... 339
71 Hydrology ['Hydrology', 'Drought', 'Oceanography', 'Aqui... 337
72 Paramount_Pictures_films ['The_Lost_Weekend_(film)', 'Animal_Crackers_(... 334
73 1930_films ['Animal_Crackers_(1930_film)', 'The_Big_House... 328
74 Lactams ['Beta-lactam', 'Flunitrazepam', 'Sildenafil',... 327
75 Limestone_caves ['Wookey_Hole_Caves', 'Mammoth_Cave_National_P... 323
76 Protein_families ['Collagen_helix', 'G_protein–coupled_receptor... 319
77 Luthiers ['Amati', 'Antonio_Stradivari', 'François_Tour... 318
78 1933_films ['King_Kong_(1933_film)', 'She_Done_Him_Wrong'... 314
79 Hodder_&_Stoughton_books ['The_IPCRESS_File', 'Thrones,_Dominations', '... 314
80 Faber_and_Faber_books ['Lord_of_the_Flies', "Old_Possum's_Book_of_Pr... 313
81 Films_based_on_American_novels ['The_Birth_of_a_Nation', 'Crossfire_(film)', ... 311
82 Piperazines ['Ciprofloxacin', 'Sildenafil', 'Clozapine', '... 310
83 Pyridines ['Pyridine', 'Nicotine', 'Triprolidine', 'Omep... 310
84 Novels_first_published_in_serial_form ['All_Quiet_on_the_Western_Front', 'Children_o... 308
85 English-language_journals ['Economic_Geology_(journal)', 'Journal_of_Flu... 307
86 1930s_comedy_films ['Animal_Crackers_(1930_film)', 'The_Thin_Man_... 304
87 British_novels_adapted_into_films ['A_Clockwork_Orange_(novel)', 'Hercule_Poirot... 304
88 Clusters_of_differentiation ['CD32', 'Cluster_of_differentiation', 'Insuli... 301
89 American_young_adult_novels ['Have_Space_Suit—Will_Travel', 'The_Dana_Girl... 295
90 Heinemann_(publisher)_books ['A_Clockwork_Orange_(novel)', 'The_Time_Machi... 289
91 Enzymes ['DNA_ligase', 'Enzyme', 'Polymerase', 'Protei... 288
92 Ethers ['Ether', 'Erythromycin', 'Morphine', 'Sodium_... 284
93 British_films ['The_Private_Life_of_Henry_VIII', 'The_Third_... 284
94 Alkenes ['Alkene', 'Ethylene', 'Thebaine', 'Triprolidi... 283
95 Orthorhombic_minerals ['Analcime', 'Sulfur', 'Topaz', 'Prehnite', 'O... 283
96 1920s_drama_films ['Wings_(1927_film)', 'The_Racket_(1928_film)'... 279
97 Carboxylic_acids ['Amoxicillin', 'Carboxylic_acid', 'PCAA', 'Pa... 277
98 Fellows_of_the_Royal_Society ['Alan_Turing', 'Edward_Jenner', 'Frederick_Ab... 276
99 EC_1.1.1 ['Alcohol_dehydrogenase', 'Malate_dehydrogenas... 275

100 rows × 3 columns

Approach

Choice of features:

  1. bag of words feature vector for each document
  2. topic vectors as features for each document. The topic vectors are obtained using LDA topic modeling
  3. doc2vec vectors as feature for each document.

Choice of clustering algorithms

  1. k-means clustering

Topic Modeling with Latent Dirichlet Allocation

In [1]:
# Imports for LDA topic modeling (gensim) and interactive topic
# visualization (pyLDAvis), grouped stdlib -> third-party.
import json
import os
import pickle  # public pickle API; it already uses the C accelerator (_pickle) internally
import warnings

import numpy as np

from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.word2vec import LineSentence
from gensim.parsing.preprocessing import stem_text, strip_numeric, strip_punctuation

import pyLDAvis
import pyLDAvis.gensim

from spacy.lang.en.stop_words import STOP_WORDS

# Token-level stop list: spaCy's English stop words plus the newline
# artifacts left over from the wiki text extraction.
stop_dict = dict((c, 1) for c in STOP_WORDS)
stop_dict['\n\n'] = 1
stop_dict['\n'] = 1
C:\Users\Astha\Anaconda2\envs\tensorflow\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")

Text normalization and creating a dictionary

In [91]:
import time
t0 = time.time()

# Stream the tokenized dump once: write a punctuation/number/stop-word
# stripped copy to disk while incrementally building a gensim Dictionary.
dct = Dictionary()
with open('data/wiki_tokenized.json', 'r') as readfile:
    with open('data/wiki_nostem.json', 'w') as writefile:
        for i, line in enumerate(readfile):
            d = json.loads(line)
            text = strip_punctuation(strip_numeric(d['text']))
            text = " ".join([word for word in text.split(" ") if word not in stop_dict])
            d['text'] = text
            writefile.write(json.dumps(d) + '\n')

            # add_documents expects an iterable of token lists; the
            # original's np.array(...).reshape(1, -1) round-trip was an
            # unnecessary way of saying [token_list].
            dct.add_documents([text.split(" ")])

            # Periodically prune to the 800k most frequent tokens (with
            # gensim's default no_below/no_above filters) to keep memory
            # bounded while streaming ~150k articles.
            if i%10000 == 0 and i!=0:
                print(i, " records processed. Length of dict = ", len(dct) )
                dct.filter_extremes(keep_n=800000)
                dct.compactify()
                print("filtered to length ", len(dct))

# Final prune so the saved dictionary matches the streaming filter.
dct.filter_extremes(keep_n=800000)
dct.compactify()
t = time.time()
print(t-t0)
gensim_dict_filepath = os.path.join("data",'gensim_dict_nostem.dict')
dct.save(gensim_dict_filepath)
10000  records processed. Length of dict =  367411
filtered to length  77869
20000  records processed. Length of dict =  279278
filtered to length  81451
30000  records processed. Length of dict =  246511
filtered to length  82826
40000  records processed. Length of dict =  217731
filtered to length  83515
50000  records processed. Length of dict =  211192
filtered to length  84254
60000  records processed. Length of dict =  210922
filtered to length  84829
70000  records processed. Length of dict =  202569
filtered to length  85421
80000  records processed. Length of dict =  173796
filtered to length  86079
90000  records processed. Length of dict =  188285
filtered to length  86499
100000  records processed. Length of dict =  206546
filtered to length  87115
110000  records processed. Length of dict =  198681
filtered to length  87565
120000  records processed. Length of dict =  201271
filtered to length  88013
130000  records processed. Length of dict =  197962
filtered to length  88583
140000  records processed. Length of dict =  194764
filtered to length  89043
150000  records processed. Length of dict =  198108
filtered to length  89583
352.3198137283325
In [2]:
# Reload the saved vocabulary so later cells can run without rebuilding
# it from the raw text.
gensim_dict_filepath = os.path.join("data", 'gensim_dict_nostem.dict')
dct = Dictionary.load(gensim_dict_filepath)

The LDA implementation below is based on a tutorial by Patrick Harrison - https://youtu.be/6zm9NC9uRkk, accompanied by a Github notebook - https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb

In [3]:
bow_filepath = os.path.join('data', 'bow_corpus.mm')

def bow_generator(filepath):
    """
    Stream a JSON-lines file of articles and yield each article's
    bag-of-words representation under the global dictionary `dct`.
    (The original docstring said "reviews" — a leftover from the
    tutorial this was adapted from.)
    """
    with open(filepath, 'r') as readfile:
        for line in readfile:
            d = json.loads(line)
            yield dct.doc2bow(d['text'].split())
In [5]:
%%time

# Guard: the serialization is expensive, so it is switched off
# (`0 == 1` is always false); flip it to re-generate the .mm file.
if 0 == 1:

    # generate bag-of-words representations for
    # all text and save them as a matrix
    MmCorpus.serialize(bow_filepath,
                       bow_generator('data/wiki_nostem.json'))
    
# load the finished bag-of-words corpus from disk
bow_corpus = MmCorpus(bow_filepath)
Wall time: 47.1 ms
In [6]:
%%time

lda_filepath = os.path.join('data', 'lda')
# Guard: training is expensive, so it is switched off (`1 == 0` is
# always false); flip it to re-train the 100-topic model.
if 1 == 0:

    # gensim emits warnings during multicore training; silence them here.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(bow_corpus,
                           num_topics=100,
                           id2word=dct,
                           workers=4)
    
    lda.save(lda_filepath)
    
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_filepath)
Wall time: 1.59 s
In [7]:
def explore_topic(topic_number, topn=25):
    """
    Accept a user-supplied topic number and print a formatted list of
    the `topn` highest-probability terms for that topic of the fitted
    LDA model `lda`.
    """
        
    print( u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    # Bug fix: pass the user-supplied `topn` through to show_topic
    # instead of the hard-coded 25, so the parameter takes effect.
    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print (u'{:20} {:.3f}'.format(term, round(frequency, 5)))
In [8]:
# Peek at one topic's top terms; judging by the output below, topic 28
# appears to be literature/biography heavy.
explore_topic(topic_number=28)
term                 frequency

s                    0.024
The                  0.016
He                   0.007
She                  0.005
In                   0.005
novel                0.004
book                 0.004
father               0.003
time                 0.003
It                   0.003
family               0.003
A                    0.003
story                0.003
life                 0.003
I                    0.003
death                0.002
years                0.002
later                0.002
–                    0.002
son                  0.002
After                0.002
mother               0.002
published            0.002
man                  0.002
work                 0.002
In [13]:
%%time

LDAvis_data_filepath = os.path.join('data', 'ldavis_prepared')
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, bow_corpus,
                                              dct)

    #with open(LDAvis_data_filepath, 'wb') as f:
        #pickle.dump(LDAvis_prepared, f)
        
# load the pre-prepared pyLDAvis data from disk
#with open(LDAvis_data_filepath) as f:
    #LDAvis_prepared = pickle.load(f)
C:\Users\Astha\Anaconda2\envs\tensorflow\lib\site-packages\pyLDAvis\_prepare.py:387: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
Wall time: 17min 8s
In [15]:
# Render the interactive topic visualization inline in the notebook.
pyLDAvis.display(LDAvis_prepared)
Out[15]:

Looking at the visualization above, topic modeling seems like a very promising approach for separating out the articles. Specifically, topics on the left are more related to social science, politics, and literature, while topics on the right are primarily about more scientific articles. There are a large number of small topics in the center; these are harder to classify into one of the two categories — science or social science.